In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import plot_tree

# Load the dataset
df = pd.read_csv('C:\datasets\coronavirusdataset.csv')
# Examine the structure of the dataset
print(df.head()) 
print(df.info())
   batch_date                     test_name swab_type covid19_test_results  \
0  2020-10-20               SARS-CoV-2, NAA     Nasal             Negative   
1  2020-10-20  COVID-19 PCR External Result     Nasal             Negative   
2  2020-10-20       Rapid COVID-19 PCR Test     Nasal             Negative   
3  2020-10-20       Rapid COVID-19 PCR Test     Nasal             Negative   
4  2020-10-20       Rapid COVID-19 PCR Test     Nasal             Negative   

   age  high_risk_exposure_occupation high_risk_interactions  diabetes    chd  \
0   39                          False                    NaN     False  False   
1   56                          False                    NaN     False  False   
2   35                          False                    NaN     False  False   
3   37                          False                    NaN     False  False   
4   42                          False                    NaN     False  False   

     htn  ...  headache  loss_of_smell  loss_of_taste  runny_nose  \
0  False  ...     False          False          False       False   
1  False  ...     False          False          False       False   
2  False  ...     False          False          False       False   
3  False  ...     False          False          False       False   
4  False  ...     False          False          False       False   

   muscle_sore  sore_throat  cxr_findings  cxr_impression  cxr_label  cxr_link  
0        False        False           NaN             NaN        NaN       NaN  
1        False        False           NaN             NaN        NaN       NaN  
2        False        False           NaN             NaN        NaN       NaN  
3        False        False           NaN             NaN        NaN       NaN  
4        False        False           NaN             NaN        NaN       NaN  

[5 rows x 45 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7294 entries, 0 to 7293
Data columns (total 45 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   batch_date                     7294 non-null   object 
 1   test_name                      7294 non-null   object 
 2   swab_type                      7294 non-null   object 
 3   covid19_test_results           7294 non-null   object 
 4   age                            7294 non-null   int64  
 5   high_risk_exposure_occupation  7294 non-null   bool   
 6   high_risk_interactions         2727 non-null   object 
 7   diabetes                       7294 non-null   bool   
 8   chd                            7294 non-null   bool   
 9   htn                            7294 non-null   bool   
 10  cancer                         7294 non-null   bool   
 11  asthma                         7294 non-null   bool   
 12  copd                           7294 non-null   bool   
 13  autoimmune_dis                 7294 non-null   bool   
 14  smoker                         7294 non-null   bool   
 15  temperature                    1869 non-null   float64
 16  pulse                          1866 non-null   float64
 17  sys                            1727 non-null   float64
 18  dia                            1727 non-null   float64
 19  rr                             1544 non-null   float64
 20  sats                           1869 non-null   float64
 21  rapid_flu_results              6 non-null      object 
 22  rapid_strep_results            11 non-null     object 
 23  ctab                           1288 non-null   object 
 24  labored_respiration            1963 non-null   object 
 25  rhonchi                        723 non-null    object 
 26  wheezes                        961 non-null    object 
 27  days_since_symptom_onset       591 non-null    float64
 28  cough                          7294 non-null   bool   
 29  cough_severity                 178 non-null    object 
 30  fever                          3137 non-null   object 
 31  sob                            7294 non-null   bool   
 32  sob_severity                   82 non-null     object 
 33  diarrhea                       7294 non-null   bool   
 34  fatigue                        7294 non-null   bool   
 35  headache                       7294 non-null   bool   
 36  loss_of_smell                  7294 non-null   bool   
 37  loss_of_taste                  7294 non-null   bool   
 38  runny_nose                     7294 non-null   bool   
 39  muscle_sore                    7294 non-null   bool   
 40  sore_throat                    7294 non-null   bool   
 41  cxr_findings                   7 non-null      object 
 42  cxr_impression                 7 non-null      object 
 43  cxr_label                      7 non-null      object 
 44  cxr_link                       7 non-null      object 
dtypes: bool(19), float64(7), int64(1), object(18)
memory usage: 1.6+ MB
None
In [6]:
# Handling missing values
df.fillna(0, inplace=True) 
# Dropping irrelevant columns
df.drop(columns=['batch_date', 'test_name', 'swab_type'], inplace=True)
# Convert 'covid19_test_results' to numerical format (assuming it contains 'Positive' and 'Negative')
df['covid19_test_results'] = df['covid19_test_results'].map({'Positive': 1, 'Negative': 0})
# For numerical columns, we can fill missing values with the mean of the column
numerical_columns = df.select_dtypes(include=['float64']).columns
df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())
# For categorical columns, we can fill missing values with a default category or the most frequent category
categorical_columns = df.select_dtypes(include=['object']).columns
df[categorical_columns] = df[categorical_columns].fillna('Unknown')
# Convert categorical columns to numerical using one-hot encoding
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
C:\Users\ENVIRONMENT-5\anaconda3\lib\site-packages\pandas\core\algorithms.py:798: FutureWarning: In a future version, the Index constructor will not infer numeric dtypes when passed object-dtype sequences (matching Series behavior)
  uniques = Index(uniques)
In [7]:
# Split the dataset into features (X) and target (y)
X = df.drop(columns=['covid19_test_results'])
y = df['covid19_test_results']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create and train the Random Forest Classifier
model =  RandomForestClassifier(n_estimators=100, random_state=0)
model.fit(X_train, y_train)
# Make predictions on the test set
print("Accuracy on training dataset",model.score(X_train, y_train))
print("Accuracy on testing dataset",model.score(X_test, y_test))
Accuracy on training dataset 0.9988003427592117
Accuracy on testing dataset 0.997943797121316
In [8]:
# Create and train the Random Forest Classifier
model =  RandomForestClassifier(n_estimators=100, random_state=0, max_depth=1)
model.fit(X_train, y_train)
# Make predictions on the test set
print("Accuracy on training dataset",model.score(X_train, y_train))
print("Accuracy on testing dataset",model.score(X_test, y_test))
Accuracy on training dataset 0.9962296486718081
Accuracy on testing dataset 0.997943797121316
In [9]:
print("Feature importance: \n", model.feature_importances_)
Feature importance: 
 [0.04 0.01 0.05 0.07 0.02 0.   0.   0.   0.   0.   0.1  0.05 0.02 0.03
 0.01 0.06 0.04 0.   0.01 0.04 0.03 0.03 0.1  0.04 0.04 0.02 0.06 0.01
 0.   0.   0.   0.   0.02 0.01 0.   0.02 0.   0.06 0.   0.01 0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.  ]
In [10]:
import matplotlib.pyplot as plt
import numpy as np
# Create a figure and axis
fig, ax = plt.subplots(figsize=(150, 100))
# Get the number of features
n_features = X_train.shape[1]
# Plot the feature importances
ax.barh(range(n_features), model.feature_importances_, align="center")
# Set the y-ticks to be the feature names
ax.set_yticks(np.arange(n_features))
ax.set_yticklabels(X.columns)
# Label the x-axis
ax.set_xlabel('Feature Importance')
# Show the plot
plt.show()
In [11]:
# Split the dataset into features (X) and target (y)
X = df.drop(columns=['covid19_test_results'])
y = df['covid19_test_results']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create and train the Gradient Boosting Classifier
model =  GradientBoostingClassifier(n_estimators=100, random_state=0)
model.fit(X_train, y_train)
# Make predictions on the test set
print("Accuracy on training dataset",model.score(X_train, y_train))
print("Accuracy on testing dataset",model.score(X_test, y_test))
Accuracy on training dataset 0.9984575835475579
Accuracy on testing dataset 0.995887594242632
In [12]:
# Create and train the Gradient Boosting Classifier
model =  GradientBoostingClassifier(n_estimators=100, random_state=0, max_depth=1)
model.fit(X_train, y_train)
# Make predictions on the test set
print("Accuracy on training dataset",model.score(X_train, y_train))
print("Accuracy on testing dataset",model.score(X_test, y_test))
Accuracy on training dataset 0.9965724078834619
Accuracy on testing dataset 0.997943797121316
In [13]:
# Create and train the Gradient Boosting Classifier
model =  GradientBoostingClassifier(n_estimators=100, random_state=0, learning_rate=0.01, max_depth=1)
model.fit(X_train, y_train)
# Make predictions on the test set
print("Accuracy on training dataset",model.score(X_train, y_train))
print("Accuracy on testing dataset",model.score(X_test, y_test))
Accuracy on training dataset 0.996401028277635
Accuracy on testing dataset 0.9972583961617546
In [14]:
print("Feature importance: \n", model.feature_importances_)
Feature importance: 
 [0.        0.        0.        0.        0.        0.        0.
 0.        0.        0.        0.7774512 0.        0.        0.
 0.        0.        0.        0.        0.        0.        0.
 0.        0.2225488 0.        0.        0.        0.        0.
 0.        0.        0.        0.        0.        0.        0.
 0.        0.        0.        0.        0.        0.        0.
 0.        0.        0.        0.        0.        0.        0.
 0.        0.        0.        0.        0.        0.        0.
 0.        0.        0.        0.        0.        0.       ]
In [15]:
import matplotlib.pyplot as plt
import numpy as np
# Create a figure and axis
fig, ax = plt.subplots(figsize=(150, 100))
# Get the number of features
n_features = X_train.shape[1]
# Plot the feature importances
ax.barh(range(n_features), model.feature_importances_, align="center")
# Set the y-ticks to be the feature names
ax.set_yticks(np.arange(n_features))
ax.set_yticklabels(X.columns)
# Label the x-axis
ax.set_xlabel('Feature Importance')
# Show the plot
plt.show()
In [16]:
# Create and train the Decision Tree Classifier
model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, y_train)
# Make predictions on the test set
print("Accuracy on training dataset",model.score(X_train, y_train))
print("Accuracy on testing dataset",model.score(X_test, y_test))
Accuracy on training dataset 0.9988003427592117
Accuracy on testing dataset 0.9890335846470185
In [17]:
# Create and train the Decision Tree Classifier
model = DecisionTreeClassifier(random_state=0, max_depth=1)
model.fit(X_train, y_train)
# Make predictions on the test set
print("Accuracy on training dataset",model.score(X_train, y_train))
print("Accuracy on testing dataset",model.score(X_test, y_test))
Accuracy on training dataset 0.996401028277635
Accuracy on testing dataset 0.9972583961617546
In [18]:
print("Feature importance: \n", model.feature_importances_)
Feature importance: 
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
In [22]:
from sklearn import tree

fig = plt.figure(figsize=(25, 15))
_ = tree.plot_tree(model, filled=True, rounded=True,
                   feature_names=X.columns,
                   class_names=["age", "temperature"])

plt.show()
In [21]:
import matplotlib.pyplot as plt
import numpy as np
# Create a figure and axis
fig, ax = plt.subplots(figsize=(150, 100))
# Get the number of features
n_features = X_train.shape[1]
# Plot the feature importances
ax.barh(range(n_features), model.feature_importances_, align="center")
# Set the y-ticks to be the feature names
ax.set_yticks(np.arange(n_features))
ax.set_yticklabels(X.columns)
# Label the x-axis
ax.set_xlabel('Feature Importance')
# Show the plot
plt.show()
In [ ]: